# install.packages("devtools")
  library(devtools)
## Warning: package 'devtools' was built under R version 3.4.1
#  source("http://bioconductor.org/biocLite.R")
#  biocLite("qvalue")
#  install_github("whitlock/OutFLANK")
  library(OutFLANK)
## Loading required package: qvalue
  if(!("adegenet" %in% installed.packages())){install.packages("adegenet")}

  library(vcfR)
## 
##    *****       ***   vcfR   ***       *****
##    This is vcfR 1.5.0 
##      browseVignettes('vcfR') # Documentation
##      citation('vcfR') # Citation
##    *****       *****      *****       *****
  library(adegenet)
## Loading required package: ade4
## Warning: package 'ade4' was built under R version 3.4.1
## 
##    /// adegenet 2.0.1 is loaded ////////////
## 
##    > overview: '?adegenet'
##    > tutorials/doc/questions: 'adegenetWeb()' 
##    > bug reports/feature requests: adegenetIssues()
  library(pcadapt)
  library(lfmm)
  library(LEA)

## devtools::install_github("privefl/bigsnpr")
  library(bigsnpr)
## Loading required package: bigstatsr
  library(bigstatsr)
library(vcfR)
library(RColorBrewer)
library(ggplot2)
library(fields)
## Loading required package: spam
## Loading required package: grid
## Spam version 1.4-0 (2016-08-29) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
## 
## Attaching package: 'spam'
## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve
## Loading required package: maps
vcf <- read.vcfR("evals/1236_RecomLowReg_VCFallsim1.vcf.gz")
## 
Meta line 13 read in.
## All meta lines processed.
## Character matrix gt created.
## Character matrix gt rows: 12736
## Character matrix gt cols: 1009
## skip: 0
## nrows: 12736
## row_num: 0
## 
## 
Processed variant 1000
Processed variant 2000
Processed variant 3000
Processed variant 4000
Processed variant 5000
Processed variant 6000
Processed variant 7000
Processed variant 8000
Processed variant 9000
Processed variant 10000
Processed variant 11000
Processed variant 12000
Processed variant: 12736
## All variants processed
ind0 <- read.table("evals/1236_RecomLowReg_outputIndAll.txt", header=TRUE)
muts <- read.table("evals/1236_RecomLowReg_outputMuts.txt", header=TRUE)
phen_env <-  read.table("evals/1236_RecomLowReg_outputPhenEnv.txt", header=TRUE)
sim <- system("grep recomb evals/1236_RecomLowReg_outputSim.txt", TRUE)

Count mutations that contribute at least 1% to genetic variance

muts$pa2 <- round(muts$selCoef^2*muts$freq*(1-muts$freq),3)
muts$prop=NA
muts$prop[muts$type=="m2"] <- muts$pa2[muts$type=="m2"]/sum(muts$pa2[muts$type=="m2"])
which(duplicated(muts$position))
## integer(0)
muts$count <- FALSE
muts$count[muts$prop>=0.01] <- TRUE
muts$count[muts$type=="m4" & muts$freq > 0.05] <- TRUE

muts
##    position    selCoef originGen type   freq   pa2        prop count
## 1     55166 -0.1407800         4   m2 1.0000 0.000 0.000000000 FALSE
## 2     59377  0.2438220      5994   m2 0.0010 0.000 0.000000000 FALSE
## 3     60924 -0.0487880      5392   m2 0.1425 0.000 0.000000000 FALSE
## 4     61367  0.6148460      5993   m2 0.0025 0.001 0.008928571 FALSE
## 5     63119  0.1743930      5996   m2 0.0030 0.000 0.000000000 FALSE
## 6     63357  0.0402488      5766   m2 0.0960 0.000 0.000000000 FALSE
## 7     64835  0.5254280      5997   m2 0.0005 0.000 0.000000000 FALSE
## 8     66264 -0.2040060       117   m2 1.0000 0.000 0.000000000 FALSE
## 9     74783 -0.2814420      5735   m2 0.1450 0.010 0.089285714  TRUE
## 10    76476  0.0385372      5822   m2 0.0305 0.000 0.000000000 FALSE
## 11    77162 -0.4895630      6000   m2 0.0005 0.000 0.000000000 FALSE
## 12    84721  0.2722690      3963   m2 0.2530 0.014 0.125000000  TRUE
## 13    85470  0.0705718      5995   m2 0.0035 0.000 0.000000000 FALSE
## 14    89074  0.2579440      5946   m2 0.0340 0.002 0.017857143  TRUE
## 15    89608 -0.5494260      5962   m2 0.0010 0.000 0.000000000 FALSE
## 16    92810 -1.0555100      5998   m2 0.0005 0.001 0.008928571 FALSE
## 17    95064 -0.4700100      3980   m2 0.7510 0.041 0.366071429  TRUE
## 18    95284  0.3243560      4862   m2 0.3510 0.024 0.214285714  TRUE
## 19    98753 -0.4121090      5998   m2 0.0005 0.000 0.000000000 FALSE
## 20    98973  0.1662070       417   m2 1.0000 0.000 0.000000000 FALSE
## 21   102738  0.3350400      6000   m2 0.0005 0.000 0.000000000 FALSE
## 22   106199 -0.4088850      5972   m2 0.0010 0.000 0.000000000 FALSE
## 23   110590  0.1460140      4546   m2 0.7540 0.004 0.035714286  TRUE
## 24   112536 -0.2875760      5947   m2 0.0090 0.001 0.008928571 FALSE
## 25   119926 -0.0105646      2471   m2 1.0000 0.000 0.000000000 FALSE
## 26   120860 -0.5917910      5961   m2 0.0035 0.001 0.008928571 FALSE
## 27   128050 -0.0431013      5339   m2 0.2650 0.000 0.000000000 FALSE
## 28   131674 -0.1409810      3704   m2 1.0000 0.000 0.000000000 FALSE
## 29   134613 -0.3318980      5998   m2 0.0010 0.000 0.000000000 FALSE
## 30   136416 -0.3858650      5705   m2 0.0145 0.002 0.017857143  TRUE
## 31   136835  0.2263680      4986   m2 0.2425 0.009 0.080357143  TRUE
## 32   140441  1.6589400      6000   m2 0.0005 0.001 0.008928571 FALSE
## 33   140576 -0.0571763      5888   m2 0.0120 0.000 0.000000000 FALSE
## 34   141982  0.3093680      1308   m2 1.0000 0.000 0.000000000 FALSE
## 35   143198 -0.2626850      5971   m2 0.0010 0.000 0.000000000 FALSE
## 36   149824  0.1493040      5975   m2 0.0245 0.001 0.008928571 FALSE
## 37   175000  0.8000000      5780   m4 0.9955 0.003          NA  TRUE

Set up genetic map for figures

### Color recombination regions ####
  lgs <- seq(50000, 500000, by=50000) # linkage groups recombination breakpoints 0.5
  lg_whereplot <- lgs - 25000
  (recom_rates <- as.numeric(unlist(strsplit(sim[1], " "))[-1]))
##   [1] 1.00000e-05 5.00000e-01 1.00000e-05 5.00000e-01 1.00000e-05
##   [6] 5.00000e-01 1.00000e-05 5.00000e-01 1.00000e-05 5.00000e-01
##  [11] 1.00000e-05 1.00000e-11 1.00000e-05 5.00000e-01 1.00000e-05
##  [16] 1.00000e-11 1.00000e-05 5.00000e-01 1.00000e-05 1.00000e-08
##  [21] 1.00000e-05 5.00000e-01 6.26869e-07 4.94044e-11 1.13234e-04
##  [26] 1.55358e-04 9.89536e-07 2.12766e-05 2.15634e-05 5.96114e-07
##  [31] 2.25052e-07 1.39301e-02 3.21115e-08 2.24422e-05 1.13803e-04
##  [36] 4.19961e-09 1.23114e-04 4.62895e-06 3.95739e-07 3.42904e-06
##  [41] 5.54896e-06 1.08548e-04 1.43782e-07 2.86846e-06 9.90242e-05
##  [46] 4.78288e-05 7.57642e-07 4.76437e-04 5.70054e-08 1.45567e-06
##  [51] 4.50553e-07 5.48482e-05 5.53232e-05 8.33817e-06 7.86841e-09
##  [56] 3.62774e-04 1.81594e-06 6.62984e-06 2.14141e-07 5.42497e-10
##  [61] 4.73403e-07 8.57399e-06 7.53417e-05 2.12371e-03 6.68129e-05
##  [66] 1.39354e-07 1.18159e-04 1.50988e-03 5.95875e-04 5.94761e-05
##  [71] 1.06801e-05 1.00944e-06 1.33482e-02 4.01259e-07 1.20515e-01
##  [76] 9.65473e-07 7.61952e-06 6.39211e-09 8.21413e-05 7.27847e-06
##  [81] 9.81278e-06 1.32774e-05 4.77499e-07 1.60898e-08 1.48064e-02
##  [86] 3.33800e-05 8.34835e-01 1.21952e-02 5.02130e-09 2.52336e-03
##  [91] 1.45148e-03 1.90139e-08 1.67139e-05 4.44781e-06 8.17499e-06
##  [96] 2.96218e-01 3.34819e-05 1.16501e-05 1.58825e-07 1.52615e-04
## [101] 1.87822e-03 6.48994e-07 1.69560e-06 2.60184e-04 6.75128e-06
## [106] 2.84624e-03 2.39607e-05 1.00247e-07 3.36079e-04 1.20344e-05
## [111] 1.44094e-05 1.45813e-05 2.67124e-05 4.05815e-07 2.03668e-07
## [116] 2.45710e-04 2.67158e-10 9.35007e-06 8.48453e-06 5.45712e-03
## [121] 3.49522e-04 1.20048e-03 5.00000e-01 1.00000e-05
  (recom_end <- as.integer(unlist(strsplit(sim[2], " "))[-1]))
##   [1]  50000  50001 100000 100001 150000 150001 200000 200001 250000 250001
##  [11] 270000 280000 300000 300001 320000 330000 350000 350001 370000 380000
##  [21] 400000 400001 400213 401694 401739 402182 402246 402299 402663 403305
##  [31] 403993 404452 404470 404979 405098 405986 406965 408000 408035 408725
##  [41] 408942 409115 409217 410429 410492 410949 411100 411103 411586 411850
##  [51] 412326 412665 414529 414925 415608 416082 416797 418443 418642 418719
##  [61] 419196 419421 419675 421406 421537 421851 422023 422804 423876 424257
##  [71] 425048 425130 425469 425699 425836 426605 427114 427185 427417 427518
##  [81] 427964 429578 430455 431802 432200 432753 432833 432853 433121 433122
##  [91] 433301 433349 433357 433454 433474 433800 433919 434218 434751 435511
## [101] 435782 436138 437216 437892 438289 438651 440807 442869 443146 444030
## [111] 444245 444626 445080 445780 445898 447105 447404 447950 448210 448693
## [121] 449939 450000 450001 500000
  recom <- data.frame(recom_rates, recom_end)
  recom$logrates <- log10(recom_rates)
    # plot r=0.5 as black
    # plot r=1e-11 as white
  (brks <- with(recom, seq(min(logrates), max(logrates), length.out = 10)))
##  [1] -11.00000000  -9.78648882  -8.57297763  -7.35946645  -6.14595527
##  [6]  -4.93244408  -3.71893290  -2.50542172  -1.29191053  -0.07839935
  grps <- with(recom, cut(logrates, breaks = brks, include.lowest = TRUE))
  nlevels(grps)
## [1] 9
  colfunc <- paste(colorRampPalette(colors=c("lightblue", "white", "grey90"))(9), 70, sep="")
  recom$col <- colfunc[grps]
  plot(recom$logrates, col=recom$col)

### Replace chromsome 1 with actual chromosome positions ####
  ends=c(0,lgs)
  dim(vcf@gt)
## [1] 12736  1001
  vcf@fix[,"CHROM"] <- NA
  POS <- as.numeric(vcf@fix[,"POS"])

  for (i in 1:(length(ends)-1)){
    cond <- POS >= ends[i] &  POS < ends[i+1]
    print(c(ends[i], ends[i+1], sum(cond)))
    vcf@fix[cond,"CHROM"] = i
  }
## [1]     0 50000  1303
## [1]  50000 100000   1338
## [1] 100000 150000   1272
## [1] 150000 200000   1266
## [1] 200000 250000   1327
## [1] 250000 300000   1156
## [1] 300000 350000   1290
## [1] 350000 400000   1250
## [1] 400000 450000   1266
## [1] 450000 500000   1268
  table(vcf@fix[,"CHROM"])
## 
##    1   10    2    3    4    5    6    7    8    9 
## 1303 1268 1338 1272 1266 1327 1156 1290 1250 1266
  my_ord <- order(as.numeric(vcf@fix[,"POS"]))

  vcf2 <- vcf
  vcf2 <- vcf[my_ord,]
  head(vcf2)
## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.2"
## [1] "##fileDate=20170924"
## [1] "##source=SLiM"
## [1] "##INFO=<ID=MID,Number=1,Type=Integer,Description=\"Mutation ID in SLiM\">"
## [1] "##INFO=<ID=S,Number=1,Type=Float,Description=\"Selection Coefficient\">"
## [1] "##INFO=<ID=DOM,Number=1,Type=Float,Description=\"Dominance\">"
## [1] "First 6 rows."
## [1] 
## [1] "***** Fixed section *****"
##      CHROM POS   ID REF ALT QUAL   FILTER
## [1,] "1"   "43"  NA "A" "T" "1000" "PASS"
## [2,] "1"   "104" NA "A" "T" "1000" "PASS"
## [3,] "1"   "160" NA "A" "T" "1000" "PASS"
## [4,] "1"   "204" NA "A" "T" "1000" "PASS"
## [5,] "1"   "214" NA "A" "T" "1000" "PASS"
## [6,] "1"   "275" NA "A" "T" "1000" "PASS"
## [1] 
## [1] "***** Genotype section *****"
##      FORMAT i0    i1    i2    i3    i4   
## [1,] "GT"   "0|0" "0|0" "0|0" "0|0" "0|0"
## [2,] "GT"   "0|0" "0|0" "0|0" "0|0" "0|0"
## [3,] "GT"   "0|0" "0|0" "0|0" "0|0" "0|0"
## [4,] "GT"   "0|0" "0|0" "0|0" "0|0" "0|0"
## [5,] "GT"   "0|0" "0|0" "0|0" "0|0" "0|0"
## [6,] "GT"   "0|0" "0|0" "0|0" "0|0" "0|0"
## [1] "First 6 columns only."
## [1] 
## [1] "Unique GT formats:"
## [1] "GT"
## [1]
  head(vcf)
## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.2"
## [1] "##fileDate=20170924"
## [1] "##source=SLiM"
## [1] "##INFO=<ID=MID,Number=1,Type=Integer,Description=\"Mutation ID in SLiM\">"
## [1] "##INFO=<ID=S,Number=1,Type=Float,Description=\"Selection Coefficient\">"
## [1] "##INFO=<ID=DOM,Number=1,Type=Float,Description=\"Dominance\">"
## [1] "First 6 rows."
## [1] 
## [1] "***** Fixed section *****"
##      CHROM POS      ID REF ALT QUAL   FILTER
## [1,] "2"   "55167"  NA "A" "T" "1000" "PASS"
## [2,] "7"   "327919" NA "A" "T" "1000" "PASS"
## [3,] "7"   "328998" NA "A" "T" "1000" "PASS"
## [4,] "5"   "243497" NA "A" "T" "1000" "PASS"
## [5,] "4"   "153461" NA "A" "T" "1000" "PASS"
## [6,] "4"   "183906" NA "A" "T" "1000" "PASS"
## [1] 
## [1] "***** Genotype section *****"
##      FORMAT i0    i1    i2    i3    i4   
## [1,] "GT"   "1|1" "1|1" "1|1" "1|1" "1|1"
## [2,] "GT"   "0|0" "1|1" "0|0" "1|0" "0|0"
## [3,] "GT"   "0|0" "1|1" "0|0" "1|0" "0|0"
## [4,] "GT"   "1|0" "0|1" "1|1" "0|0" "1|1"
## [5,] "GT"   "0|0" "0|0" "0|1" "0|1" "0|0"
## [6,] "GT"   "0|0" "1|0" "0|0" "0|0" "0|0"
## [1] "First 6 columns only."
## [1] 
## [1] "Unique GT formats:"
## [1] "GT"
## [1]

Plotting functions

plot_r_legend <- function(){
   ### Plot recombination legend
  xl <- 1
  yb <- 1
  xr <- 1.5
  yt <- 2
  
  ncol = 9
  par(mar=c(5.1,2.5,3.1,0.5))
  plot(NA,type="n",ann=FALSE,xlim=c(1,2),ylim=c(1,2),xaxt="n",yaxt="n",bty="n")
  mtext("r", side=3, adj=0.2, cex=2)
  rect(
       xl,
       head(seq(yb,yt,(yt-yb)/ncol),-1),
       xr,
       tail(seq(yb,yt,(yt-yb)/ncol),-1),
       col=colfunc
      )
  mtext(c(10^(round(brks)[1:9]),0.5),side=2,at=seq(yb,yt,(yt-yb)/(ncol)),las=2,cex=0.7)
}
  
  
### Plot function
  recom_end2 = c(0, recom_end)
plot_layers <- function(y_head=0, y_arrows=c(1,0.25), ...){
  ### Plot recombination variation
  for (i in 1:nrow(recom))
  {
    polygon(x = c(recom_end2[i], recom$recom_end[i], recom$recom_end[i], recom_end2[i]),
            y = c(-1000, -1000, 1000, 1000),
            col=as.character(recom$col[i]), border = NA)
  }
  abline(v=lgs)
  
  text(lg_whereplot, y = y_head, 
       labels = c("LG1\nNeut", "LG2\nQTL", "LG3\nQTL",
                  "LG4\nSS", "LG5\nBS",
                  "LG6\nlow r+\nBS", "LG7\nlow r",
                  "LG8\nmed r", "LG9\nr var", "LG10\nNeut"))
  
  
  ### Add QTLs and Sweep Location
  arrows(muts$position[muts$count],  y_arrows[1], muts$position[muts$count],  y_arrows[2], col="orange", lwd=muts$prop[muts$count]*20, length = 0.1)
  arrows(muts$position[muts$type=="m4"], y_arrows[1], muts$position[muts$type=="m4"], y_arrows[2], col="purple", lwd=2, length = 0.1)
} #end plot function

layout(matrix(1:2,nrow=1),widths=c(0.8,0.2))
par(mar=c(5.1,3.1,3.1,1.9))
plot(0,0, col="white", xlim=c(0, 500000), ylim=c(-1,1), xaxs="i", yaxt="n", ylab="", xlab="Position (bp)")
plot_layers()
plot_r_legend()

Conversion script

# NB: Creates a vcfR object (stored in RAM) which size is twice as big as the original vcf file. So when dealing with large data, make sure you have enough RAM space available before proceeding.

geno <- extract.gt(vcf2) # Character matrix containing the genotypes
position <- getPOS(vcf2) # Positions in bp
chromosome <- getCHROM(vcf2) # Chromosome information

G <- matrix(NA, nrow = nrow(geno), ncol = ncol(geno))

G[geno %in% c("0/0", "0|0")] <- 0
G[geno  %in% c("0/1", "1/0", "1|0", "0|1")] <- 1
G[geno %in% c("1/1", "1|1")] <- 2
## Remove fixed loci or all heterozygotes ####
dim(G)
## [1] 12736  1000
head(G[1:10,1:10])
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,]    0    0    0    0    0    0    0    0    0     0
## [2,]    0    0    0    0    0    0    0    0    0     0
## [3,]    0    0    0    0    0    0    0    0    0     1
## [4,]    0    0    0    0    0    0    0    0    0     0
## [5,]    0    0    0    0    0    0    0    0    0     0
## [6,]    0    0    0    0    0    0    0    0    0     0
rem = c(which(rowSums(G)==0), which(rowSums(G-2)==0)) ## fixed loci
position[rem]
## [1]  55167  66265  98974 119927 131675 141983
training <- list(G = G[-rem,], position = position[-rem], chromosome = chromosome[-rem])
vcf_filt <- vcf2[-rem,]
dim(vcf_filt@gt)
## [1] 12730  1001

Assign individuals to populations

  ### optional assignment to pops
  toclust <- ind0[,c("x","y")]
  d <- dist(toclust)
  hc <- hclust(d, method="ward.D")
  #fit <- kmeans(toclust, 30) 
  #plot(ind$x, ind$y, pch=fit$cluster, col=fit$cluster+1)
  k <- 39
  group <- cutree(hc, k=k)
  table(group)
## group
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
## 33 36 27 22 27 13 26 18 19 18 27 26 24 36 34 23 15 33 46 54 24 28 16 17 32 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 
## 42 10 37  8 20 22 34 33 26 23 20 20 19 12
  (group_env <- sort(round(tapply(ind0$envi, group, mean), 1)))
##   37    1   25   23   36   14   20   30   28   32   18   24   26    2   27 
## -1.6 -1.3 -1.3 -1.1 -1.1 -1.0 -1.0 -1.0 -0.7 -0.7 -0.5 -0.5 -0.5 -0.4 -0.4 
##   12   31    4    9   10    3   34    8   38   13   21   22    5   11   15 
## -0.3 -0.3 -0.2 -0.2 -0.2 -0.1 -0.1  0.0  0.0  0.1  0.3  0.3  0.4  0.5  0.5 
##   33    7   35   17   39    6   16   29   19 
##  0.6  0.7  0.7  0.8  0.8  0.9  0.9  1.0  1.1
  (group_table <- data.frame(group=names(group_env), group_env, new_g = 1:39))
##    group group_env new_g
## 37    37      -1.6     1
## 1      1      -1.3     2
## 25    25      -1.3     3
## 23    23      -1.1     4
## 36    36      -1.1     5
## 14    14      -1.0     6
## 20    20      -1.0     7
## 30    30      -1.0     8
## 28    28      -0.7     9
## 32    32      -0.7    10
## 18    18      -0.5    11
## 24    24      -0.5    12
## 26    26      -0.5    13
## 2      2      -0.4    14
## 27    27      -0.4    15
## 12    12      -0.3    16
## 31    31      -0.3    17
## 4      4      -0.2    18
## 9      9      -0.2    19
## 10    10      -0.2    20
## 3      3      -0.1    21
## 34    34      -0.1    22
## 8      8       0.0    23
## 38    38       0.0    24
## 13    13       0.1    25
## 21    21       0.3    26
## 22    22       0.3    27
## 5      5       0.4    28
## 11    11       0.5    29
## 15    15       0.5    30
## 33    33       0.6    31
## 7      7       0.7    32
## 35    35       0.7    33
## 17    17       0.8    34
## 39    39       0.8    35
## 6      6       0.9    36
## 16    16       0.9    37
## 29    29       1.0    38
## 19    19       1.1    39
  ind0$group <- group
  ind2 <- merge(ind0, group_table)
  head(ind2)
##   group  id        x        y phenotype1     envi phenotype2 group_env
## 1     1   0 0.967790 0.156432  0.1855210 -1.23276  0.1855210      -1.3
## 2     1 766 0.989254 0.184359  0.3714770 -1.29156  0.3714770      -1.3
## 3     1 831 0.966644 0.155146 -1.3844000 -1.22898 -1.3844000      -1.3
## 4     1 813 0.997858 0.192452 -0.0842549 -1.31945 -0.0842549      -1.3
## 5     1 783 0.994767 0.190174  0.1650630 -1.30821  0.1650630      -1.3
## 6     1 650 0.962838 0.159555 -1.1741400 -1.19981 -1.1741400      -1.3
##   new_g
## 1     2
## 2     2
## 3     2
## 4     2
## 5     2
## 6     2
    # the merge puts individuals out of order relative to the genotype matrix
    # the id should put them back into order
  ind <- ind2[order(ind2$id),]
  head(ind)
##     group id        x         y phenotype1      envi phenotype2 group_env
## 1       1  0 0.967790 0.1564320  0.1855210 -1.232760  0.1855210      -1.3
## 50      2  1 0.242315 0.0480874 -0.2245730 -0.408370 -0.2245730      -0.4
## 82      3  2 0.816066 0.5568300  0.0472845 -0.114717  0.0472845      -0.1
## 104     4  3 0.770902 0.9594540 -0.8786200 -0.207345 -0.8786200      -0.2
## 119     5  4 0.380837 0.0457108  0.5888120  0.368229  0.5888120       0.4
## 158     6  5 0.320940 0.5404820  0.2488010  0.881583  0.2488010       0.9
##     new_g
## 1       2
## 50     14
## 82     21
## 104    18
## 119    28
## 158    36
  par(mfrow=c(1,1))
  plot(ind$x, ind$y, pch=ind$group%%6, col=adjustcolor(ind$group%%3+2, alpha=0.2))
    text(tapply(ind$x, ind$new_g, mean), tapply(ind$y, ind$new_g, mean), label=1:39)

  #write.table(ind, "outputIndAll_pop.txt")

LD pruned set of loci

 G0<-add_code256(big_copy(t(training$G),type="raw"),code=bigsnpr:::CODE_012)
## Warning: Assignment will down cast from double to raw.
## Hint: To remove this warning, use options(bigstatsr.typecast.warning = FALSE).
    #puts it in the raw format and stores likelihood genotype probability
  dim(G0)
## [1]  1000 12730
  str(G)
##  num [1:12736, 1:1000] 0 0 0 0 0 0 0 0 0 0 ...
  head(G[,1:10])
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,]    0    0    0    0    0    0    0    0    0     0
## [2,]    0    0    0    0    0    0    0    0    0     0
## [3,]    0    0    0    0    0    0    0    0    0     1
## [4,]    0    0    0    0    0    0    0    0    0     0
## [5,]    0    0    0    0    0    0    0    0    0     0
## [6,]    0    0    0    0    0    0    0    0    0     0
  newpc<-snp_autoSVD(G=G0,infos.chr = training$chromosome,infos.pos = training$position)
## Phase of clumping at r2 > 0.2.. keep 7438 SNPs.
## 
## Iteration 1:
## Computing SVD..
## 
## Converged!
    # this is doing SNP pruning - removing correlated SNPs
    # take snps with highest MAF and correlate snps around it
    # Snps with R^2 > 0.2 are removed
    # the subset is the indexes of the remaining SNPs
  str(newpc)
## List of 7
##  $ d     : num [1:10] 208 164 162 159 158 ...
##  $ u     : num [1:1000, 1:10] 0.00369 0.01023 0.00133 0.01267 -0.03907 ...
##  $ v     : num [1:7438, 1:10] -0.00236 -0.00826 0.01564 -0.00537 0.00347 ...
##  $ niter : int 10
##  $ nops  : int 178
##  $ center: num [1:7438] 0.002 0.016 0.067 0.001 0.003 0.105 0.014 0.002 0.006 0.005 ...
##  $ scale : num [1:7438] 0.0447 0.126 0.2545 0.0316 0.0547 ...
##  - attr(*, "class")= chr "big_SVD"
##  - attr(*, "subset")= int [1:7438] 1 2 3 4 6 7 8 9 11 12 ...
##  - attr(*, "lrldr")='data.frame':    0 obs. of  3 variables:
##   ..$ Chr  : int(0) 
##   ..$ Start: int(0) 
##   ..$ Stop : int(0)
  plot(newpc) 

  which_pruned <- attr(newpc, which="subset")
  length(which_pruned)
## [1] 7438
### Calculate average LD around each SNP ####  
  LD <- LD2<- rep(NA, ncol(G0))
    # the following is not my most efficient lines of code
  for(i in 26:(ncol(G0)-26)){
    LD[i]=mean(cor(G0[,(i-25):(i+25)])[,25])
  }

  layout(matrix(1))
  plot(training$position, abs(LD), pch=20, ylim=c(0,0.18), xaxs="i", col=rgb(0,0,0,0.5), xlab="Position (bp)", ylab="Average LD 50-SNP windows")
  plot_layers(y_head=0.17, y_arrows=c(0.02, 0))

hist(training$position[which_pruned], breaks=seq(0,500000, by=10000), col="lightgrey")
plot_layers(y_head=20, y_arrows=c(40,25))

Population structure

Principle components based on all data

aux<-pcadapt(training$G,K=10)
## Number of SNPs: 12730
## Number of individuals: 1000
str(aux)
## List of 10
##  $ scores         : num [1:1000, 1:10] 0.0287 -0.0677 0.0303 -0.0179 0.0331 ...
##  $ singular.values: num [1:10] 11.48 7.6 6.31 5.91 5.63 ...
##  $ zscores        : num [1:12730, 1:10] 0 0 0 0 0 ...
##  $ loadings       : num [1:12730, 1:10] 0 0 0 0 0 ...
##  $ maf            : num [1:12730] 0.001 0.008 0.0335 0.0005 0.0015 0.0015 0.0525 0.007 0.001 0.0005 ...
##  $ missing        : num [1:12730] 0 0 0 0 0 0 0 0 0 0 ...
##  $ stat           : num [1:12730(1d)] NA NA NA NA NA ...
##  $ gif            : num 1.21
##  $ chi2.stat      : num [1:12730(1d)] NA NA NA NA NA ...
##  $ pvalues        : num [1:12730] NA NA NA NA NA ...
##  - attr(*, "class")= chr "pcadapt"
##  - attr(*, "K")= num 10
##  - attr(*, "data.type")= chr "genotype"
##  - attr(*, "method")= chr "mahalanobis"
##  - attr(*, "min.maf")= num 0.05
plot(aux,option="screeplot")

x <-pcadapt(training$G,K=4)
## Number of SNPs: 12730
## Number of individuals: 1000
summary(x)
##                 Length Class  Mode   
## scores           4000  -none- numeric
## singular.values     4  -none- numeric
## zscores         50920  -none- numeric
## loadings        50920  -none- numeric
## maf             12730  -none- numeric
## missing         12730  -none- numeric
## stat            12730  -none- numeric
## gif                 1  -none- numeric
## chi2.stat       12730  -none- numeric
## pvalues         12730  -none- numeric
par(mar=c(4,4,1,1))
plot(x$scores[,1], x$scores[,2])

inlowr <- which(training$position>(325000-50) & training$position<(325000+50))
(snppos_inlowr <- training$position[inlowr])
## [1] 324954 324968 324969
haplotype <- as.factor(paste(training$G[inlowr[1],], training$G[inlowr[2],], training$G[inlowr[3],], sep="_"))
table(haplotype)
## haplotype
## 0_0_0 0_0_1 0_0_2 0_1_0 0_1_1 0_2_0 1_1_0 1_2_0 
##   389    70     5   403    32    93     5     3
qplot(x$scores[,1], x$scores[,2], colour=haplotype, main="Individual scores without LD pruning") + scale_colour_manual(values = two.colors(n=8,"red", "blue", middle="grey")) + theme_bw()

Loading of genomic regions onto PC axes calculated from all data

#plot_layers(ylim=c(min(x$loadings[,1]), max(x$loadings[,1])), ylab="Loadings PC1")
layout(matrix(c(1,2,3,4,6,5,5,6),nrow=4),widths=c(0.8,0.2))
par(oma=c(3,3,1,0), mar=c(2,2,0,0))
## Top plot
  summary(x$loadings[,1])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -9.714207  0.000000  0.000000 -0.002193  0.000000  9.714207
  plot(training$position,x$loadings[,1], xaxs="i", pch=20, ylim=c(-11, 15)) 
  plot_layers(y_head = 12, y_arrows=c(-8, -11))
## Middle plot
  summary(x$loadings[,2])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -11.33192   0.00000   0.00000   0.01442   0.00000  11.62424
  plot(training$position,x$loadings[,2], xaxs="i", pch=20, ylim=c(-11, 13)) 
  plot_layers(y_head = 20, y_arrows=c(-8, -11))
##  Middle plot 
  summary(x$loadings[,3])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -10.76386   0.00000   0.00000  -0.02859   0.00000  10.76386
  plot(training$position,x$loadings[,3], xaxs="i", pch=20, ylim=c(-15, 15)) 
  plot_layers(y_head = 20, y_arrows=c(-12, -15))
##  Bottom plot 
  summary(x$loadings[,4])
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -17.805202   0.000000   0.000000   0.007629   0.000000  17.805202
  plot(training$position,x$loadings[,3], xaxs="i", pch=20, ylim=c(-13, 13)) 
  plot_layers(y_head = 20, y_arrows=c(-10, -13))
## Right plot
  plot_r_legend()
  mtext("Position (bp)", outer=TRUE, side=1, line=1, adj=0.4)
  mtext("Loading on PC axis", outer=TRUE, side=2, line=1, adj=0.5)

Principle components based on pruned data

dim(training$G)
## [1] 12730  1000
aux<-pcadapt(training$G[which_pruned,],K=30)
## Number of SNPs: 7438
## Number of individuals: 1000
str(aux)
## List of 10
##  $ scores         : num [1:1000, 1:30] -0.000653 0.015434 0.00039 0.012151 -0.051125 ...
##  $ singular.values: num [1:30] 4.81 3.77 3.73 3.61 3.58 ...
##  $ zscores        : num [1:7438, 1:30] 0 0 0 0 0 ...
##  $ loadings       : num [1:7438, 1:30] 0 0 0 0 0 ...
##  $ maf            : num [1:7438] 0.001 0.008 0.0335 0.0005 0.0015 0.0525 0.007 0.001 0.003 0.0025 ...
##  $ missing        : num [1:7438] 0 0 0 0 0 0 0 0 0 0 ...
##  $ stat           : num [1:7438(1d)] NA NA NA NA NA ...
##  $ gif            : num 1.11
##  $ chi2.stat      : num [1:7438(1d)] NA NA NA NA NA ...
##  $ pvalues        : num [1:7438] NA NA NA NA NA ...
##  - attr(*, "class")= chr "pcadapt"
##  - attr(*, "K")= num 30
##  - attr(*, "data.type")= chr "genotype"
##  - attr(*, "method")= chr "mahalanobis"
##  - attr(*, "min.maf")= num 0.05
plot(aux,option="screeplot")

x_LD <-pcadapt(training$G[which_pruned,],K=3)
## Number of SNPs: 7438
## Number of individuals: 1000
summary(x_LD)
##                 Length Class  Mode   
## scores           3000  -none- numeric
## singular.values     3  -none- numeric
## zscores         22314  -none- numeric
## loadings        22314  -none- numeric
## maf              7438  -none- numeric
## missing          7438  -none- numeric
## stat             7438  -none- numeric
## gif                 1  -none- numeric
## chi2.stat        7438  -none- numeric
## pvalues          7438  -none- numeric
par(mar=c(4,4,1,1))

qplot(x_LD$scores[,1], x_LD$scores[,2], colour=ind$envi, main="Individual scores with LD pruning", xlab="PC1 scores", ylab="PC2 scores" ) + theme_bw() + labs(colour="Envi") + scale_color_gradient2( low="blue", mid="white",
                     high="red", space ="Lab" )

Loading of genomic regions onto PC axes calculated from pruned data

#plot_layers(ylim=c(min(x$loadings[,1]), max(x$loadings[,1])), ylab="Loadings PC1")
layout(matrix(c(1,2,3,3),nrow=2),widths=c(0.8,0.2))
par(oma=c(3,3,1,0), mar=c(2,2,0,0))
## Top plot
  summary(x_LD$loadings[,1])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -9.91659  0.00000  0.00000 -0.02043  0.00000 11.68196
  plot(training$position[which_pruned],x_LD$loadings[,1], xaxs="i", pch=20, ylim=c(-11, 15)) 
  plot_layers(y_head = 12, y_arrows=c(-5, -11))
## Middle plot
  summary(x_LD$loadings[,2])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -6.011610  0.000000  0.000000  0.008229  0.000000  8.099326
  plot(training$position[which_pruned],x_LD$loadings[,2], xaxs="i", pch=20, ylim=c(-11, 13)) 
  plot_layers(y_head = 20, y_arrows=c(-8, -11))
## Right plot
  plot_r_legend()
  mtext("Position (bp)", outer=TRUE, side=1, line=1, adj=0.4)
  mtext("Loading on PC axis", outer=TRUE, side=2, line=1, adj=0.5)

Admixture/ancestry based on all data (snmf in LEA package)

write.geno(t(training$G), "genotypes.geno")
## [1] "genotypes.geno"
project = snmf("genotypes.geno",
                K = 1:4, 
                entropy = TRUE, 
                repetitions = 3,
                project = "new")
## The project is saved into :
##  genotypes.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes.snmfProject")
## 
## [1] 1914330943
## [1] "*************************************"
## [1] "*          create.dataset            *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)                 1000
##         -L (number of loci)                        12730
##         -s (seed random init)                      1914330943
##         -r (percentage of masked data)             0.05
##         -x (genotype file in .geno format)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
##         -o (output file in .geno format)           /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## 
##  Write genotype file with masked data, /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno:     OK.
## 
## [1] "*************************************"
## [1] "* sNMF K = 1  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    12730
##         -K (number of ancestral pops)          1
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run1/genotypes_r1.1.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run1/genotypes_r1.1.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140460229812031
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno:     OK.
## 
## 
## Main algorithm:
## 
## Least-square error: 2184733.310493
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run1/genotypes_r1.1.Q:        OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run1/genotypes_r1.1.G: OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                12730
##         -K (number of ancestral pops)      1
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run1/genotypes_r1.1.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run1/genotypes_r1.1.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.297424
## Cross-Entropy (masked data):  0.299513
## The project is saved into :
##  genotypes.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 2  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    12730
##         -K (number of ancestral pops)          2
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run1/genotypes_r1.2.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run1/genotypes_r1.2.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  4595222878120206143
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno:     OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [======]
## Number of iterations: 16
## 
## Least-square error: 2133085.167554
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run1/genotypes_r1.2.Q:        OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run1/genotypes_r1.2.G: OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                12730
##         -K (number of ancestral pops)      2
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run1/genotypes_r1.2.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run1/genotypes_r1.2.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.290459
## Cross-Entropy (masked data):  0.294316
## The project is saved into :
##  genotypes.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 3  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    12730
##         -K (number of ancestral pops)          3
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run1/genotypes_r1.3.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run1/genotypes_r1.3.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140460229812031
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno:     OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [===============]
## Number of iterations: 39
## 
## Least-square error: 2115270.930074
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run1/genotypes_r1.3.Q:        OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run1/genotypes_r1.3.G: OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                12730
##         -K (number of ancestral pops)      3
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run1/genotypes_r1.3.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run1/genotypes_r1.3.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.28694
## Cross-Entropy (masked data):  0.292057
## The project is saved into :
##  genotypes.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 4  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    12730
##         -K (number of ancestral pops)          4
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run1/genotypes_r1.4.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run1/genotypes_r1.4.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140460229812031
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno:     OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [=============================]
## Number of iterations: 77
## 
## Least-square error: 2104544.184406
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run1/genotypes_r1.4.Q:        OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run1/genotypes_r1.4.G: OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                12730
##         -K (number of ancestral pops)      4
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run1/genotypes_r1.4.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run1/genotypes_r1.4.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.285632
## Cross-Entropy (masked data):  0.290917
## The project is saved into :
##  genotypes.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes.snmfProject")
## 
## [1] 1658766106
## [1] "*************************************"
## [1] "*          create.dataset            *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)                 1000
##         -L (number of loci)                        12730
##         -s (seed random init)                      1658766106
##         -r (percentage of masked data)             0.05
##         -x (genotype file in .geno format)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
##         -o (output file in .geno format)           /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## 
##  Write genotype file with masked data, /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno:     OK.
## 
## [1] "*************************************"
## [1] "* sNMF K = 1  repetition 2      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    12730
##         -K (number of ancestral pops)          1
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run2/genotypes_r2.1.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run2/genotypes_r2.1.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140459974247194
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno:     OK.
## 
## 
## Main algorithm:
## 
## Least-square error: 2184045.338543
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run2/genotypes_r2.1.Q:        OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run2/genotypes_r2.1.G: OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                12730
##         -K (number of ancestral pops)      1
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run2/genotypes_r2.1.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run2/genotypes_r2.1.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.297444
## Cross-Entropy (masked data):  0.299182
## The project is saved into :
##  genotypes.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 2  repetition 2      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    12730
##         -K (number of ancestral pops)          2
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run2/genotypes_r2.2.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run2/genotypes_r2.2.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  4636033605571625754
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno:     OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [======]
## Number of iterations: 16
## 
## Least-square error: 2133438.318838
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run2/genotypes_r2.2.Q:        OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run2/genotypes_r2.2.G: OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                12730
##         -K (number of ancestral pops)      2
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run2/genotypes_r2.2.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run2/genotypes_r2.2.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.290486
## Cross-Entropy (masked data):  0.293674
## The project is saved into :
##  genotypes.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 3  repetition 2      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    12730
##         -K (number of ancestral pops)          3
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run2/genotypes_r2.3.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run2/genotypes_r2.3.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140459974247194
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno:     OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [==============]
## Number of iterations: 38
## 
## Least-square error: 2115496.525583
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run2/genotypes_r2.3.Q:        OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run2/genotypes_r2.3.G: OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                12730
##         -K (number of ancestral pops)      3
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run2/genotypes_r2.3.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run2/genotypes_r2.3.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.286974
## Cross-Entropy (masked data):  0.291713
## The project is saved into :
##  genotypes.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 4  repetition 2      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    12730
##         -K (number of ancestral pops)          4
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run2/genotypes_r2.4.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run2/genotypes_r2.4.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140459974247194
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno:     OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [===========================]
## Number of iterations: 71
## 
## Least-square error: 2104443.508871
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run2/genotypes_r2.4.Q:        OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run2/genotypes_r2.4.G: OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                12730
##         -K (number of ancestral pops)      4
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run2/genotypes_r2.4.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run2/genotypes_r2.4.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.285638
## Cross-Entropy (masked data):  0.290789
## The project is saved into :
##  genotypes.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes.snmfProject")
## 
## [1] 445680127
## [1] "*************************************"
## [1] "*          create.dataset            *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)                 1000
##         -L (number of loci)                        12730
##         -s (seed random init)                      445680127
##         -r (percentage of masked data)             0.05
##         -x (genotype file in .geno format)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
##         -o (output file in .geno format)           /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
## 
##  Write genotype file with masked data, /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno:     OK.
## 
## [1] "*************************************"
## [1] "* sNMF K = 1  repetition 3      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    12730
##         -K (number of ancestral pops)          1
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run3/genotypes_r3.1.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run3/genotypes_r3.1.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  445680127
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno:     OK.
## 
## 
## Main algorithm:
## 
## Least-square error: 2184839.526636
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run3/genotypes_r3.1.Q:        OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run3/genotypes_r3.1.G: OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                12730
##         -K (number of ancestral pops)      1
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run3/genotypes_r3.1.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K1/run3/genotypes_r3.1.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.297387
## Cross-Entropy (masked data):  0.300231
## The project is saved into :
##  genotypes.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 2  repetition 3      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    12730
##         -K (number of ancestral pops)          2
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run3/genotypes_r3.2.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run3/genotypes_r3.2.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140458761161215
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno:     OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [======]
## Number of iterations: 16
## 
## Least-square error: 2132647.984240
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run3/genotypes_r3.2.Q:        OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run3/genotypes_r3.2.G: OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                12730
##         -K (number of ancestral pops)      2
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run3/genotypes_r3.2.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K2/run3/genotypes_r3.2.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.290421
## Cross-Entropy (masked data):  0.294919
## The project is saved into :
##  genotypes.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 3  repetition 3      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    12730
##         -K (number of ancestral pops)          3
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run3/genotypes_r3.3.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run3/genotypes_r3.3.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  4612136376258824703
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno:     OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [===================]
## Number of iterations: 50
## 
## Least-square error: 2115523.051283
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run3/genotypes_r3.3.Q:        OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run3/genotypes_r3.3.G: OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                12730
##         -K (number of ancestral pops)      3
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run3/genotypes_r3.3.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K3/run3/genotypes_r3.3.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.286917
## Cross-Entropy (masked data):  0.292771
## The project is saved into :
##  genotypes.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 4  repetition 3      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    12730
##         -K (number of ancestral pops)          4
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run3/genotypes_r3.4.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run3/genotypes_r3.4.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140458761161215
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno:     OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [=========================]
## Number of iterations: 68
## 
## Least-square error: 2104444.538962
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run3/genotypes_r3.4.Q:        OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run3/genotypes_r3.4.G: OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                12730
##         -K (number of ancestral pops)      4
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run3/genotypes_r3.4.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/K4/run3/genotypes_r3.4.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes.snmf/masked/genotypes_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.285599
## Cross-Entropy (masked data):  0.291981
## The project is saved into :
##  genotypes.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes.snmfProject")
par(mfrow=c(1,1), mar=c(3,3,3,1))
#project
# plot cross-entropy criterion of all runs of the project
plot(project, cex = 1.2, col = "blue", pch = 19)

# get the cross-entropy of all runs for K = 3
ce = cross.entropy(project, K = 2)
ce
##           K = 2
## run 1 0.2943160
## run 2 0.2936740
## run 3 0.2949187
# select the run with the lowest cross-entropy for K = 2
best = which.min(ce)

# display the Q-matrix
Q.matrix <- as.matrix(Q(project, K = 2, run = best))
dim(Q.matrix)
## [1] 1000    2
cluster <- apply(Q.matrix, 1, which.max)
my.colors <- c("tomato", "lightblue", "olivedrab")#, "gold")

ord <- order(ind$envi)
dim(Q.matrix)
## [1] 1000    2
bp <- barplot(t(Q.matrix[ord,]), 
        border = NA, 
        space = 0, 
        col = my.colors, 
        xlab = "Individuals",
        ylab = "Ancestry proportions", 
        main = "Ancestry matrix") 

#axis(1, at = 1:nrow(Q.matrix), labels = bp$order, las = 3, cex.axis = .4)


# get the ancestral genotype frequency matrix, G, for the 2nd run for K = 4. 
G.matrix = G(project, K = 3, run = 2)

Admixture/ancestry based on pruned data (snmf in LEA package)

write.geno(t(training$G[which_pruned,]), "genotypes_LD.geno")
## [1] "genotypes_LD.geno"
project_LD = snmf("genotypes_LD.geno",
                K = 1:4, 
                entropy = TRUE, 
                repetitions = 3,
                project = "new")
## The project is saved into :
##  genotypes_LD.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes_LD.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes_LD.snmfProject")
## 
## [1] 794266710
## [1] "*************************************"
## [1] "*          create.dataset            *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)                 1000
##         -L (number of loci)                        7438
##         -s (seed random init)                      794266710
##         -r (percentage of masked data)             0.05
##         -x (genotype file in .geno format)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
##         -o (output file in .geno format)           /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## 
##  Write genotype file with masked data, /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno:       OK.
## 
## [1] "*************************************"
## [1] "* sNMF K = 1  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    7438
##         -K (number of ancestral pops)          1
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run1/genotypes_LD_r1.1.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run1/genotypes_LD_r1.1.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140459109747798
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno:       OK.
## 
## 
## Main algorithm:
## 
## Least-square error: 1116941.990039
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run1/genotypes_LD_r1.1.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run1/genotypes_LD_r1.1.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                7438
##         -K (number of ancestral pops)      1
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run1/genotypes_LD_r1.1.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run1/genotypes_LD_r1.1.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.261933
## Cross-Entropy (masked data):  0.263142
## The project is saved into :
##  genotypes_LD.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes_LD.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes_LD.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 2  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    7438
##         -K (number of ancestral pops)          2
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run1/genotypes_LD_r1.2.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run1/genotypes_LD_r1.2.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  4602678819966913622
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [=======================]
## Number of iterations: 62
## 
## Least-square error: 1109787.255171
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run1/genotypes_LD_r1.2.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run1/genotypes_LD_r1.2.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                7438
##         -K (number of ancestral pops)      2
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run1/genotypes_LD_r1.2.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run1/genotypes_LD_r1.2.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.25924
## Cross-Entropy (masked data):  0.26154
## The project is saved into :
##  genotypes_LD.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes_LD.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes_LD.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 3  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    7438
##         -K (number of ancestral pops)          3
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run1/genotypes_LD_r1.3.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run1/genotypes_LD_r1.3.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140459109747798
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [==================================================]
## Number of iterations: 133
## 
## Least-square error: 1105352.995343
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run1/genotypes_LD_r1.3.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run1/genotypes_LD_r1.3.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                7438
##         -K (number of ancestral pops)      3
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run1/genotypes_LD_r1.3.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run1/genotypes_LD_r1.3.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.258182
## Cross-Entropy (masked data):  0.261556
## The project is saved into :
##  genotypes_LD.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes_LD.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes_LD.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 4  repetition 1      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    7438
##         -K (number of ancestral pops)          4
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run1/genotypes_LD_r1.4.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run1/genotypes_LD_r1.4.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140459109747798
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [=====================================================================]
## Number of iterations: 185
## 
## Least-square error: 1101496.920349
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run1/genotypes_LD_r1.4.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run1/genotypes_LD_r1.4.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                7438
##         -K (number of ancestral pops)      4
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run1/genotypes_LD_r1.4.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run1/genotypes_LD_r1.4.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.257073
## Cross-Entropy (masked data):  0.261336
## The project is saved into :
##  genotypes_LD.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes_LD.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes_LD.snmfProject")
## 
## [1] 218275337
## [1] "*************************************"
## [1] "*          create.dataset            *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)                 1000
##         -L (number of loci)                        7438
##         -s (seed random init)                      218275337
##         -r (percentage of masked data)             0.05
##         -x (genotype file in .geno format)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
##         -o (output file in .geno format)           /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## 
##  Write genotype file with masked data, /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno:       OK.
## 
## [1] "*************************************"
## [1] "* sNMF K = 1  repetition 2      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    7438
##         -K (number of ancestral pops)          1
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run2/genotypes_LD_r2.1.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run2/genotypes_LD_r2.1.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140458533756425
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno:       OK.
## 
## 
## Main algorithm:
## 
## Least-square error: 1117093.198247
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run2/genotypes_LD_r2.1.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run2/genotypes_LD_r2.1.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                7438
##         -K (number of ancestral pops)      1
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run2/genotypes_LD_r2.1.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run2/genotypes_LD_r2.1.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.261897
## Cross-Entropy (masked data):  0.263818
## The project is saved into :
##  genotypes_LD.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes_LD.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes_LD.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 2  repetition 2      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    7438
##         -K (number of ancestral pops)          2
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run2/genotypes_LD_r2.2.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run2/genotypes_LD_r2.2.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140458533756425
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [==================]
## Number of iterations: 47
## 
## Least-square error: 1110177.425147
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run2/genotypes_LD_r2.2.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run2/genotypes_LD_r2.2.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                7438
##         -K (number of ancestral pops)      2
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run2/genotypes_LD_r2.2.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run2/genotypes_LD_r2.2.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.259193
## Cross-Entropy (masked data):  0.262317
## The project is saved into :
##  genotypes_LD.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes_LD.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes_LD.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 3  repetition 2      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    7438
##         -K (number of ancestral pops)          3
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run2/genotypes_LD_r2.3.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run2/genotypes_LD_r2.3.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140458533756425
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [===========================================================================]
## Number of iterations: 200
## 
## Least-square error: 1105595.059315
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run2/genotypes_LD_r2.3.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run2/genotypes_LD_r2.3.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                7438
##         -K (number of ancestral pops)      3
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run2/genotypes_LD_r2.3.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run2/genotypes_LD_r2.3.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.258112
## Cross-Entropy (masked data):  0.262279
## The project is saved into :
##  genotypes_LD.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes_LD.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes_LD.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 4  repetition 2      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    7438
##         -K (number of ancestral pops)          4
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run2/genotypes_LD_r2.4.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run2/genotypes_LD_r2.4.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140458533756425
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [=====================================]
## Number of iterations: 100
## 
## Least-square error: 1101620.754549
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run2/genotypes_LD_r2.4.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run2/genotypes_LD_r2.4.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                7438
##         -K (number of ancestral pops)      4
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run2/genotypes_LD_r2.4.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run2/genotypes_LD_r2.4.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.257056
## Cross-Entropy (masked data):  0.262195
## The project is saved into :
##  genotypes_LD.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes_LD.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes_LD.snmfProject")
## 
## [1] 1153480913
## [1] "*************************************"
## [1] "*          create.dataset            *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)                 1000
##         -L (number of loci)                        7438
##         -s (seed random init)                      1153480913
##         -r (percentage of masked data)             0.05
##         -x (genotype file in .geno format)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
##         -o (output file in .geno format)           /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
## 
##  Write genotype file with masked data, /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno:       OK.
## 
## [1] "*************************************"
## [1] "* sNMF K = 1  repetition 3      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    7438
##         -K (number of ancestral pops)          1
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run3/genotypes_LD_r3.1.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run3/genotypes_LD_r3.1.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140459468962001
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno:       OK.
## 
## 
## Main algorithm:
## 
## Least-square error: 1116617.652157
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run3/genotypes_LD_r3.1.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run3/genotypes_LD_r3.1.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                7438
##         -K (number of ancestral pops)      1
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run3/genotypes_LD_r3.1.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K1/run3/genotypes_LD_r3.1.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.261804
## Cross-Entropy (masked data):  0.265613
## The project is saved into :
##  genotypes_LD.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes_LD.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes_LD.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 2  repetition 3      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    7438
##         -K (number of ancestral pops)          2
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run3/genotypes_LD_r3.2.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run3/genotypes_LD_r3.2.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  9743415505
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [===============================]
## Number of iterations: 83
## 
## Least-square error: 1109553.171174
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run3/genotypes_LD_r3.2.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run3/genotypes_LD_r3.2.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                7438
##         -K (number of ancestral pops)      2
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run3/genotypes_LD_r3.2.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K2/run3/genotypes_LD_r3.2.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.259097
## Cross-Entropy (masked data):  0.26412
## The project is saved into :
##  genotypes_LD.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes_LD.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes_LD.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 3  repetition 3      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    7438
##         -K (number of ancestral pops)          3
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run3/genotypes_LD_r3.3.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run3/genotypes_LD_r3.3.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140459468962001
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [=======================================================================]
## Number of iterations: 190
## 
## Least-square error: 1105254.842039
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run3/genotypes_LD_r3.3.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run3/genotypes_LD_r3.3.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                7438
##         -K (number of ancestral pops)      3
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run3/genotypes_LD_r3.3.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K3/run3/genotypes_LD_r3.3.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.258024
## Cross-Entropy (masked data):  0.264091
## The project is saved into :
##  genotypes_LD.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes_LD.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes_LD.snmfProject")
## 
## [1] "*************************************"
## [1] "* sNMF K = 4  repetition 3      *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)             1000
##         -L (number of loci)                    7438
##         -K (number of ancestral pops)          4
##         -x (input file)                        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         -q (individual admixture file)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run3/genotypes_LD_r3.4.Q
##         -g (ancestral frequencies file)        /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run3/genotypes_LD_r3.4.G
##         -i (number max of iterations)          200
##         -a (regularization parameter)          10
##         -s (seed random init)                  140459468962001
##         -e (tolerance error)                   1E-05
##         -p (number of processes)               1
##         - diploid
## 
## Read genotype file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno:       OK.
## 
## 
## Main algorithm:
##  [                                                                           ]
##  [===========================================================================]
## Number of iterations: 200
## 
## Least-square error: 1101273.674557
## Write individual ancestry coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run3/genotypes_LD_r3.4.Q:      OK.
## Write ancestral allele frequency coefficient file /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run3/genotypes_LD_r3.4.G:   OK.
## 
## [1] "*************************************"
## [1] "*    cross-entropy estimation       *"
## [1] "*************************************"
## summary of the options:
## 
##         -n (number of individuals)         1000
##         -L (number of loci)                7438
##         -K (number of ancestral pops)      4
##         -x (genotype file)                 /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.geno
##         -q (individual admixture)          /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run3/genotypes_LD_r3.4.Q
##         -g (ancestral frequencies)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/K4/run3/genotypes_LD_r3.4.G
##         -i (with masked genotypes)         /Users/katie/Desktop/RecombinationGenomeScans/genotypes_LD.snmf/masked/genotypes_LD_I.geno
##         - diploid
## 
## Cross-Entropy (all data):     0.25694
## Cross-Entropy (masked data):  0.263926
## The project is saved into :
##  genotypes_LD.snmfProject 
## 
## To load the project, use:
##  project = load.snmfProject("genotypes_LD.snmfProject")
## 
## To remove the project, use:
##  remove.snmfProject("genotypes_LD.snmfProject")
#project
# plot cross-entropy criterion of all runs of the project
plot(project_LD, cex = 1.2, col = "blue", pch = 19)

# get the cross-entropy of all runs for K = 3
ce = cross.entropy(project_LD, K = 2)
ce
##           K = 2
## run 1 0.2615403
## run 2 0.2623173
## run 3 0.2641197
# select the run with the lowest cross-entropy for K = 2
best = which.min(ce)

# display the Q-matrix
Q.matrix <- as.matrix(Q(project_LD, K = 2, run = best))
dim(Q.matrix)
## [1] 1000    2
cluster <- apply(Q.matrix, 1, which.max)
my.colors <- c("tomato", "lightblue")#, "olivedrab")#, "gold")

ord <- order(ind$envi)
dim(Q.matrix)
## [1] 1000    2
par(mfrow=c(1,1), mar=c(4,4,3,1))
bp <- barplot(t(Q.matrix[ord,]), 
        border = NA, 
        space = 0, 
        col = my.colors, 
        xlab = "Individuals (sorted by environment)",
        ylab = "Ancestry proportions", 
        main = "Ancestry matrix") 

#axis(1, at = 1:nrow(Q.matrix), labels = bp$order, las = 3, cex.axis = .4)


# get the ancestral genotype frequency matrix, G, for the 2nd run for K = 4. 
G.matrix = G(project, K = 3, run = 2)

OutFLANK

All data

dim(training$G)
## [1] 12730  1000
FstDataFrame <- MakeDiploidFSTMat(t(training$G),training$position,ind$group)
## Calculating FSTs, may take a few minutes...
## [1] "10000 done of 12730"
head(FstDataFrame)
##   LocusName        He         FST           T1           T2  FSTNoCorr
## 1        43 0.0019980 0.003210998 3.209683e-06 0.0009995906 0.02265019
## 2       104 0.0158720 0.002094303 1.662963e-05 0.0079404119 0.02143843
## 3       160 0.0647555 0.033544517 1.087723e-03 0.0324262676 0.05356826
## 4       204 0.0009995 0.002397900 1.199032e-06 0.0005000340 0.02187832
## 5       214 0.0029955 0.025341903 3.800214e-05 0.0014995771 0.04391575
## 6       275 0.0029955 0.004571143 6.850739e-06 0.0014986929 0.02394794
##       T1NoCorr     T2NoCorr meanAlleleFreq
## 1 2.264259e-05 0.0009996647         0.9990
## 2 1.702425e-04 0.0079409973         0.9920
## 3 1.737151e-03 0.0324287423         0.9665
## 4 1.094072e-05 0.0005000712         0.9995
## 5 6.585972e-05 0.0014996832         0.9985
## 6 3.589326e-05 0.0014988036         0.9985
#str(FstDataFrame)
plot(FstDataFrame$He, FstDataFrame$FST)

plot(as.numeric(FstDataFrame$LocusName)[FstDataFrame$He>0.1], FstDataFrame$FST[FstDataFrame$He>0.1], ylim=c(0,0.1))
plot_layers(y_head=0.1, y_arrows=c(0.1,0.06))

k <- 39 ## Number of pops 
out_ini <- OutFLANK(FstDataFrame, NumberOfSamples=k) 
  ## Run outflank on FST dataframe
#out_ini <- OutFLANK(FstDataFrame[FstDataFrame$He>0.05,], NumberOfSamples=k) 
  ## Run outflank without low He loci

# Plot results to compare chi-squared distribution vs. actual FST distribution
OutFLANKResultsPlotter(out_ini, withOutliers = TRUE,
                       NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
                         FALSE, RightZoomFraction = 0.05, titletext = NULL)

## Poor fit, particularly on right tail
OutFLANKResultsPlotter(out_ini, withOutliers = TRUE,
                       NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
                         TRUE, RightZoomFraction = 0.15, titletext = NULL)

# Histogram of P-values weird
hist(out_ini$results$pvaluesRightTail,breaks = 20)

With LD pruning

#### LD Pruning ####

#### Evaluating OutFLANK with pruned data ####
plot(FstDataFrame$He[which_pruned], FstDataFrame$FST[which_pruned])

Fstdf2 <- FstDataFrame[which_pruned,] 
dim(Fstdf2)
## [1] 7438    9
Fstdf3 <- Fstdf2[Fstdf2$He>0.05,]

### Trimming without He trimming
out_trim1 <- OutFLANK(Fstdf2, NumberOfSamples=k, Hmin = 0.05)
OutFLANKResultsPlotter(out_trim1, withOutliers = TRUE,
                       NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
                         FALSE, RightZoomFraction = 0.15, titletext = NULL)

out_trim <- OutFLANK(Fstdf3, NumberOfSamples=k)
  # I do not think that OutFLANK is removing low H loci correctly
  # The fit is much better if I remove these manually than if I do not
head(out_trim$results)
##    LocusName        He         FST           T1         T2  FSTNoCorr
## 3        160 0.0647555 0.033544517 0.0010877235 0.03242627 0.05356826
## 7        311 0.0994875 0.005853766 0.0002913773 0.04977604 0.02451949
## 14       566 0.4890480 0.016621863 0.0040684189 0.24476312 0.03576409
## 17       627 0.4999395 0.011441064 0.0028623924 0.25018585 0.03210544
## 20       687 0.2854875 0.022495971 0.0032147429 0.14290305 0.04028350
## 22       726 0.1563795 0.017020085 0.0013321117 0.07826704 0.03611756
##       T1NoCorr   T2NoCorr meanAlleleFreq indexOrder GoodH   qvalues
## 3  0.001737151 0.03242874         0.9665          1  lowH 0.6983500
## 7  0.001220570 0.04977958         0.9475          2  lowH 0.9983058
## 14 0.008754369 0.24478098         0.5740          3 goodH 0.9983058
## 17 0.008032960 0.25020555         0.4945          4 goodH 0.9983058
## 20 0.005757025 0.14291274         0.8275          5 goodH 0.9983058
## 22 0.002827020 0.07827273         0.9145          6 goodH 0.9983058
##       pvalues pvaluesRightTail OutlierFlag
## 3  0.02555251       0.01277625       FALSE
## 7  0.28227599       0.85886200       FALSE
## 14 0.67903024       0.33951512       FALSE
## 17 0.97532025       0.51233987       FALSE
## 20 0.35583083       0.17791541       FALSE
## 22 0.64902077       0.32451039       FALSE
plot(out_trim$results$He, out_trim$results$FST)

OutFLANKResultsPlotter(out_trim, withOutliers = TRUE,
                       NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
                         FALSE, RightZoomFraction = 0.15, titletext = NULL)

OutFLANKResultsPlotter(out_trim, withOutliers = TRUE,
                       NoCorr = TRUE, Hmin = 0.1, binwidth = 0.001, Zoom =
                         TRUE, RightZoomFraction = 0.10, titletext = NULL)

# Decent distribution fit, no trimming needed. 
hist(out_trim$results$pvaluesRightTail,breaks = 20)

  # still a conservative histogram, which is typical of outflank

outliers_crit <- out_trim$results$qvalues<0.2
(outliers_LD <- out_trim$results$LocusName[outliers_crit])
## [1] 498325  84667  95065 190575 191782 264811
# outliers identified 

### Plot trimmed data only
plot(out_trim$results$LocusName,out_trim$results$FST, ylim=c(0,0.1), xaxs="i")
plot_layers(y_head=0.1, y_arrows=c(0.1,0.06))
points(out_trim$results$LocusName[outliers_crit], 
       out_trim$results$FST[outliers_crit], pch=19)

### Ad-hoc estimates of p-values for all data
Fstdf_adhoc <- FstDataFrame
new_dist <- FstDataFrame$FSTNoCorr*out_trim$dfInferred/out_trim$FSTNoCorrbar
hist(new_dist)

Fstdf_adhoc$p <- pchisq(new_dist, df = out_trim$dfInferred)
Fstdf_adhoc$p[Fstdf_adhoc$He<0.1] <- NA
hist(Fstdf_adhoc$p)

plot(-log10(1-Fstdf_adhoc$p), Fstdf_adhoc$FST)

Fstdf_adhoc$q <- qvalue(1-Fstdf_adhoc$p)$qvalues
plot(Fstdf_adhoc$q, Fstdf_adhoc$FST)

plot(as.numeric(Fstdf_adhoc$LocusName)[Fstdf_adhoc$He>0.1], Fstdf_adhoc$FST[Fstdf_adhoc$He>0.1], ylim=c(0,0.1), xaxs="i")
  plot_layers(y_head=0.1, y_arrows=c(0.09, 0.07))
  points(Fstdf_adhoc$LocusName[Fstdf_adhoc$q<0.2], Fstdf_adhoc$FST[Fstdf_adhoc$q<0.2], pch=20)

With individuals assigned to pops based on PCs from all data

PCADAPT

All data

plot(x, option = "qqplot", threshold = 0.05, main="pcadapt")

#plot(x, option = "stat.distribution")
summary(x$chi2.stat)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
##      0.00      1.87      3.36   4225.89      6.12 156361.97      8421
# Default output from PCAdapt
  par(mfrow=c(1,1))
  plot(training$position, sqrt(x$chi2.stat), col="black", pch=20, main="pcadapt without LD pruning", ylim=c(0,500))
  plot_layers(y_head=450, y_arrows = c(50,0))

With LD pruning

plot(x_LD, option = "qqplot", threshold = 0.05, main="pcadapt")

#plot(x, option = "stat.distribution")
summary(x_LD$chi2.stat)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.010   1.208   2.366   3.327   4.302  59.470    5239
# Default output from PCAdapt
  par(mfrow=c(1,1))
  plot(training$position[which_pruned], x_LD$chi2.stat, col="black", pch=20, main="pcadapt without LD pruning", ylim=c(0,100))
  plot_layers(y_head=450, y_arrows = c(100,70))

LFMM

Ridge regression model phenotype ~ genotype

# extract scaled genotypes
scaled.genotype <- scale(as.matrix(t(training$G)))
  #scaled.genotype <- as.matrix(t(sim1$G))
# extract scaled phenotypes
phen <- scale(as.matrix(ind$phenotype1))
  # centering is important to remove mean
  # x <- scale(as.matrix(sim1$phenotype1), center=TRUE, scale=FALSE)
  # x <- as.matrix(sim1$phenotype1)
  # to do mean and not SD. this might make it possible to get effect sizes
#pc <- prcomp(scaled.genotype,)
#plot(pc$sdev[1:20]^2)
#points(5,pc$sdev[5]^2, type = "h", lwd = 3, col = "blue")


# ridge regression
lfmm.ridge <- lfmm::lfmm_ridge(Y = scaled.genotype, X = phen, K = 3, lambda = 1e-4)
#The lfmm.ridge object contains estimates for the latent variables and for the effect sizes. Here, the estimates are used for computing calibrated significance values and for testing associations between the response matrix Y and the explanatory variable x. It can be done as follows:

lfmm.test <- lfmm::lfmm_test(Y = scaled.genotype, X = phen, lfmm = lfmm.ridge, calibrate = "gif")
p.values <- lfmm.test$calibrated.pvalue
lfmm.test$gif
## [1] 3.983324
hist(p.values, col = "lightgreen", main="LFMM ridge")

qval <- qvalue::qvalue(p.values)
plot(qval)

#The plot suggests that setting fdr.level = 0.025 warrant few false positives.

qval <- qvalue::qvalue(p.values, fdr.level = 0.005)
candidates <- which(qval$significant)


plot(training$position, -log10(p.values), cex = .5, pch = 19, col = "black", main="LFMM ridge", ylim=c(0, 60))
  plot_layers(y_head=55, y_arrows=c(10, 0))

#### Effect sizes ridge regression step 1: run lfmm_ridge (or any lfmm model) and get the estimated latent factors from the U matrix (obj.lfmm$U). When lfmm is run with K factors and n individuals, U is an n by K matrix.

step 2: perform a standard linear regression analysis of the phenotype on the SNP frequency (in the direction opposite to LFMM) by adding U as covariate to the model. This will estimate the LFMM effect size for each SNP. The R command should look like this: lm( y ~ . , data = data.frame(genotype[,i], U))

str(lfmm.ridge)
## List of 5
##  $ K     : num 3
##  $ lambda: num 1e-04
##  $ U     : num [1:1000, 1:3] 10.97 -26.98 11.8 -6.45 11.18 ...
##  $ V     : num [1:12730, 1:3] 0.003738 -0.003444 0.001665 0.002643 -0.000452 ...
##  $ B     : num [1:12730, 1] 0.00675 0.04897 -0.0848 0.00549 0.02794 ...
##  - attr(*, "class")= chr "ridgeLFMM"
m2 <- which(training$position %in% (muts$position[muts$type=="m2" & muts$count==TRUE]))
dim(G)
## [1] 12736  1000
effects <- data.frame(position=training$position[m2], est_coef_ridge=NA, est_coef_PC=NA)
### Try Olivier's suggestion
for (i in 1:length(m2)){
  effects$est_coef_ridge[i] <- lm(phen ~., data = data.frame(gen = training$G[m2[i],], lfmm.ridge$U))$coef[2]
}

### Use PC axes as covariates
for (i in 1:length(m2)){
  effects$est_coef_PC[i] <- lm(phen ~., data = data.frame(gen = training$G[m2[i],], x_LD$scores))$coef[2]
}
effects
##   position est_coef_ridge est_coef_PC
## 1   110590      0.2589593  0.07340806
(new_muts <- merge(muts,effects))
##   position  selCoef originGen type  freq   pa2       prop count
## 1   110590 0.146014      4546   m2 0.754 0.004 0.03571429  TRUE
##   est_coef_ridge est_coef_PC
## 1      0.2589593  0.07340806
plot(new_muts$selCoef, new_muts$est_coef_ridge, abline(0,1), col="blue", pch=19, xlab="True effect size", ylab="Estimated effect size")
points(new_muts$selCoef, new_muts$est_coef_PC, abline(0,1), col="green", pch=15)

LASSO model

#LFMM parameters can alternatively be estimated by solving regularized least-squares mimimization, with lasso penalty as follows.
lfmm.lasso <- lfmm::lfmm_lasso(Y = scaled.genotype, X = phen, K = 3, nozero.prop = 0.02)
## It = 1/100, err2 = 0.999000000049747
## It = 2/100, err2 = 0.988683353488457
## It = 3/100, err2 = 0.988694883822047
## It = 4/100, err2 = 0.988699730079893
## It = 5/100, err2 = 0.988700739298245
## === lambda = 0.151284210384581, no zero B proportion = 0.0232521602513747
 #The lfmm.lasso object contains new estimates for the latent variables and for the effect sizes. Note that for lasso, we didn't set the value of a regularization parameter. Instead, we set the proportion of non-null effects (here 2 percent).

lfmm.test <- lfmm::lfmm_test(Y = scaled.genotype, X = phen, lfmm = lfmm.lasso, calibrate = "gif")
p.values <- lfmm.test$calibrated.pvalue
lfmm.test$gif
## [1] 1.548855
hist(p.values, col = "lightblue")

qval <- qvalue::qvalue(p.values)
plot(qval)

qval <- qvalue::qvalue(p.values, fdr.level = 0.005)
candidates <- which(qval$significant)

plot(training$position, -log10(p.values), cex = .5, pch = 19, col = "black", main="LFMM lasso", ylim=c(0, 50), xaxs="i")
  plot_layers(y_head=45, y_arrows=c(5,0))

Bayesian (LEA) model

iHS

Conversion scripts

library(rehh)
## Loading required package: rehh.data
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
### Convert vcf@gt to haplotype format .thap
# one file for each chromosome
#SNP1  SNP2  SNP3
#IND1   hap1     A     T     A
#IND1   hap2     A     C     T
#IND2   hap1     G     C     T
#IND2   hap2     A     T     A
nlociperchrom <- table(vcf@fix[,"CHROM"])

substring("0|1", 1, last=1)
## [1] "0"
substring("0|1", 3, last=3)
## [1] "1"
head(vcf_filt)
## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.2"
## [1] "##fileDate=20170924"
## [1] "##source=SLiM"
## [1] "##INFO=<ID=MID,Number=1,Type=Integer,Description=\"Mutation ID in SLiM\">"
## [1] "##INFO=<ID=S,Number=1,Type=Float,Description=\"Selection Coefficient\">"
## [1] "##INFO=<ID=DOM,Number=1,Type=Float,Description=\"Dominance\">"
## [1] "First 6 rows."
## [1] 
## [1] "***** Fixed section *****"
##      CHROM POS   ID REF ALT QUAL   FILTER
## [1,] "1"   "43"  NA "A" "T" "1000" "PASS"
## [2,] "1"   "104" NA "A" "T" "1000" "PASS"
## [3,] "1"   "160" NA "A" "T" "1000" "PASS"
## [4,] "1"   "204" NA "A" "T" "1000" "PASS"
## [5,] "1"   "214" NA "A" "T" "1000" "PASS"
## [6,] "1"   "275" NA "A" "T" "1000" "PASS"
## [1] 
## [1] "***** Genotype section *****"
##      FORMAT i0    i1    i2    i3    i4   
## [1,] "GT"   "0|0" "0|0" "0|0" "0|0" "0|0"
## [2,] "GT"   "0|0" "0|0" "0|0" "0|0" "0|0"
## [3,] "GT"   "0|0" "0|0" "0|0" "0|0" "0|0"
## [4,] "GT"   "0|0" "0|0" "0|0" "0|0" "0|0"
## [5,] "GT"   "0|0" "0|0" "0|0" "0|0" "0|0"
## [6,] "GT"   "0|0" "0|0" "0|0" "0|0" "0|0"
## [1] "First 6 columns only."
## [1] 
## [1] "Unique GT formats:"
## [1] "GT"
## [1]
rem2 <- which(duplicated(vcf_filt@fix[,"POS"]))
sort(vcf_filt@fix[rem2,"POS"])
##   [1] "100074" "100091" "100972" "106904" "110255" "118610" "119269"
##   [8] "119621" "121090" "122662" "129555" "133804" "136833" "144318"
##  [15] "149174" "149498" "161735" "167332" "16845"  "169909" "172360"
##  [22] "176123" "184649" "185817" "188068" "189350" "189451" "193490"
##  [29] "195752" "196359" "196407" "197840" "199591" "200698" "206956"
##  [36] "210085" "216495" "219722" "225496" "227522" "230671" "237178"
##  [43] "237700" "242152" "245561" "245615" "248539" "263579" "265707"
##  [50] "27021"  "284608" "290053" "291324" "294300" "296710" "297069"
##  [57] "298187" "300539" "301776" "307744" "311448" "316403" "319403"
##  [64] "31971"  "322187" "325382" "328998" "333698" "340411" "343427"
##  [71] "344190" "34624"  "348053" "348665" "350232" "353102" "355416"
##  [78] "356165" "357666" "358453" "359762" "362735" "364094" "372484"
##  [85] "372804" "374444" "375849" "376534" "380272" "38119"  "382815"
##  [92] "382934" "383330" "38347"  "383881" "387728" "390172" "390625"
##  [99] "395176" "406774" "408853" "409483" "411912" "422281" "424722"
## [106] "424824" "425698" "428187" "43123"  "434423" "436752" "440005"
## [113] "442018" "444233" "446075" "449955" "450142" "456033" "460666"
## [120] "46096"  "462406" "464065" "465567" "475793" "476720" "479024"
## [127] "483296" "48421"  "485875" "492607" "495206" "498223" "498470"
## [134] "51415"  "51987"  "54824"  "57585"  "60225"  "62208"  "64342" 
## [141] "64983"  "70207"  "70319"  "76531"  "79041"  "81321"  "84987" 
## [148] "85505"  "87397"  "92842"  "97730"  "99544"  "99627"  "99693"
vcf_filt2 = vcf_filt[-rem2,]

### Get into right format
for (i in 1:length(nlociperchrom)){
  keep <- which((vcf_filt2@fix[,"CHROM"]==i))
  head(vcf_filt2@gt[keep,1:10], 10)
  hap1 <- apply(vcf_filt2@gt[keep,-1], 1, FUN=function(x) substring(x,1,1))
  dim(hap1)
  #head(hap1[,1:10])
  hap2 <-  apply(vcf_filt2@gt[keep,-1], 1, FUN=function(x) substring(x,3,3))
  #head(hap2[,1:10])
  
  hapt_out <- matrix(NA, nrow=2*1000, ncol=length(keep))
  odd <- seq(1,(2*1000), by=2)
  even <- odd +1
  hapt_out[odd,] <- as.numeric(hap1)
  rownames(hapt_out) <- rep("", nrow(hapt_out))
  rownames(hapt_out)[odd] <- rownames(hap1)
  rownames(hapt_out)[even] <- rownames(hap2)
  hapt_out[even,] <- as.numeric(hap2)
  #head(hapt_out[,1:10])
  write.table(cbind(rownames(hapt_out), hapt_out+1), paste0("chrom",i,".thap"), row.names=F, col.names=F, quote = FALSE)
}
#a<- read.table("chrom1.thap")

### Also need to convert map.inp
#Each line contains five columns corresponding to:
#1. the SNP name
#2. the SNP chromosome (or scaffold) of origin
#3. the SNP position on the chromosome (or scaffold). Note that it is up to the user to choose either
#physical or genetic map positions (if available).
#4. the SNP ancestral allele (as coded in the haplotype input file)
#5. the SNP derived alleles (as coded in the haplotype input file)
map <- data.frame(name=1:nrow(vcf_filt2), chrom=as.numeric(vcf_filt2@fix[,"CHROM"]), 
                  pos=as.numeric(vcf_filt2@fix[,"POS"]), anc=1, derived=2)
  # setting anc=0 and derived = 1 thinks missing data
head(map)
##   name chrom pos anc derived
## 1    1     1  43   1       2
## 2    2     1 104   1       2
## 3    3     1 160   1       2
## 4    4     1 204   1       2
## 5    5     1 214   1       2
## 6    6     1 275   1       2
which(duplicated(map$pos))
## integer(0)
write.table(map, "map.inp", row.names=F, col.names=F)

iHS Analysis

cnt=0
for(i in 1:length(nlociperchrom)){
 cnt=cnt+1
 tmp.hapfile=paste0("chrom",i,".thap")
 
 tmp.hap=data2haplohh(hap_file=tmp.hapfile, map_file="map.inp", chr.name=i,haplotype.in.columns=FALSE)
 
tmp.scan=scan_hh(tmp.hap,threads=4)
 
 if(cnt==1){wgscan=tmp.scan}else{wgscan=rbind(wgscan,tmp.scan)}
}
## Map file seems OK: 1294  SNPs declared for chromosome 1 
## Standard rehh input file assumed
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1294 SNPs
## Map file seems OK: 1314  SNPs declared for chromosome 2 
## Standard rehh input file assumed
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1314 SNPs
## Map file seems OK: 1253  SNPs declared for chromosome 3 
## Standard rehh input file assumed
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1253 SNPs
## Map file seems OK: 1250  SNPs declared for chromosome 4 
## Standard rehh input file assumed
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1250 SNPs
## Map file seems OK: 1313  SNPs declared for chromosome 5 
## Standard rehh input file assumed
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1313 SNPs
## Map file seems OK: 1147  SNPs declared for chromosome 6 
## Standard rehh input file assumed
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1147 SNPs
## Map file seems OK: 1275  SNPs declared for chromosome 7 
## Standard rehh input file assumed
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1275 SNPs
## Map file seems OK: 1227  SNPs declared for chromosome 8 
## Standard rehh input file assumed
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1227 SNPs
## Map file seems OK: 1250  SNPs declared for chromosome 9 
## Standard rehh input file assumed
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1250 SNPs
## Map file seems OK: 1253  SNPs declared for chromosome 10 
## Standard rehh input file assumed
## Discard Haplotype with less than  100 % of genotyped SNPs
## No haplotype discarded
## Discard SNPs genotyped on less than  100 % of haplotypes
## No SNP discarded
## Data consists of 2000 haplotypes and 1253 SNPs
dim(wgscan)
## [1] 12576     7
ihs=ihh2ihs(wgscan,minmaf=0.05,freqbin=0.05)
  head(ihs$iHS,25)
##    CHR POSITION        iHS -log10(p-value)
## 7    1      311         NA              NA
## 14   1      566         NA              NA
## 17   1      627         NA              NA
## 19   1      634         NA              NA
## 20   1      687         NA              NA
## 22   1      726         NA              NA
## 24   1      863         NA              NA
## 27   1      932         NA              NA
## 31   1     1120         NA              NA
## 32   1     1171         NA              NA
## 34   1     1216         NA              NA
## 35   1     1247         NA              NA
## 41   1     1468         NA              NA
## 46   1     1699         NA              NA
## 48   1     1811         NA              NA
## 49   1     1819         NA              NA
## 54   1     1946         NA              NA
## 59   1     2110         NA              NA
## 63   1     2228         NA              NA
## 66   1     2486         NA              NA
## 70   1     2675 -0.1092506      0.03952743
## 80   1     2945         NA              NA
## 81   1     2962         NA              NA
## 83   1     3018         NA              NA
## 84   1     3024         NA              NA
  tail(ihs$iHS)
##       CHR POSITION iHS -log10(p-value)
## 12554  10   499130  NA              NA
## 12561  10   499367  NA              NA
## 12567  10   499629  NA              NA
## 12568  10   499739  NA              NA
## 12569  10   499764  NA              NA
## 12576  10   499989  NA              NA
  ihs$frequency.class
##            Number of SNPs mean of the log(iHHA/iHHD) ratio
## 0.05 - 0.1             75                      0.951238483
## 0.1 - 0.15             76                      0.588511796
## 0.15 - 0.2             79                      0.421991438
## 0.2 - 0.25             76                      0.377900862
## 0.25 - 0.3            114                      0.190045269
## 0.3 - 0.35            147                      0.202779781
## 0.35 - 0.4             95                      0.005106286
## 0.4 - 0.45            111                     -0.069256982
## 0.45 - 0.5            114                     -0.140772122
## 0.5 - 0.55            157                     -0.177965029
## 0.55 - 0.6            189                     -0.255648542
## 0.6 - 0.65            158                     -0.279530058
## 0.65 - 0.7            248                     -0.404833732
## 0.7 - 0.75            284                     -0.523388652
## 0.75 - 0.8            296                     -0.558672530
## 0.8 - 0.85            411                     -0.743383397
## 0.85 - 0.9            579                     -0.939585453
## 0.9 - 0.95           1064                     -1.296392187
##            sd of the log(iHHA/iHHD) ratio
## 0.05 - 0.1                      0.5309461
## 0.1 - 0.15                      0.3936370
## 0.15 - 0.2                      0.4217720
## 0.2 - 0.25                      0.4080201
## 0.25 - 0.3                      0.3778977
## 0.3 - 0.35                      0.3972260
## 0.35 - 0.4                      0.4017517
## 0.4 - 0.45                      0.3992746
## 0.45 - 0.5                      0.4154989
## 0.5 - 0.55                      0.3704670
## 0.55 - 0.6                      0.4288702
## 0.6 - 0.65                      0.3961157
## 0.65 - 0.7                      0.3916281
## 0.7 - 0.75                      0.3976893
## 0.75 - 0.8                      0.4482664
## 0.8 - 0.85                      0.4605596
## 0.85 - 0.9                      0.4979967
## 0.9 - 0.95                      0.5904876
  #distribplot(ihs$iHS$iHS)

  #ihsplot(ihs)
  ihs$iHS[which(ihs$iHS[,4]>2),]
##       CHR POSITION       iHS -log10(p-value)
## 147     1     5169 -2.666785        2.115880
## 637     1    23403 -3.079055        2.682651
## 1416    2    54838  3.097942        2.710256
## 1949    2    75287  2.675152        2.126706
## 2024    2    77878  2.835195        2.339156
## 2262    2    86607  3.401843        3.174360
## 2267    2    86737 -3.373727        3.129843
## 2269    2    86755 -2.701493        2.160968
## 2286    2    87349 -2.977203        2.536270
## 2490    2    96094 -2.684031        2.138225
## 3114    3   119525 -2.610321        2.043557
## 3471    3   133910  2.584122        2.010428
## 6194    5   241386 -2.595876        2.025258
## 6856    6   269724 -2.733836        2.203418
## 7191    6   285093 -3.389856        3.155341
## 7949    7   315080  2.680613        2.133787
## 8089    7   320515 -2.609330        2.042298
## 8394    7   330460  2.885329        2.407818
## 9297    8   368887 -3.238545        2.920309
## 9318    8   369659 -2.885205        2.407647
## 9534    8   378729 -3.047222        2.636450
## 9547    8   379372 -2.641691        2.083581
## 9746    8   387622 -2.598707        2.028837
## 10559   9   420454  2.604928        2.036715
## 10638   9   423702  2.801077        2.293006
## 10639   9   423720  2.766193        2.246302
## 10724   9   427249  2.643252        2.085583
## 11221   9   446192  2.597399        2.027183
  plot(ihs$iHS$POSITION,ihs$iHS$`-log10(p-value)`, col="grey", pch=20, main="REHH iHS")
    plot_layers()

  plot(ihs$iHS$POSITION,ihs$iHS$iHS, col="grey", pch=20, main="REHH iHS")
    plot_layers()